Method

Load required packages:

library(sf) # spatial data manipulation
## Linking to GEOS 3.8.0, GDAL 2.4.2, PROJ 6.3.0
library(data.table) # fast data manipulation
library(leaflet) # easy mapping in R
library(ggplot2) # plotting
library(rpart); library(rpart.plot) # decision tree models
library(MASS); library(pscl) # Poisson and zero-inflated regression
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
source("R/summary_plots.R") # function for creating summary plots

options(scipen = 99) # disable scientific notation in printed model output

Load Data

Road Segments and Traffic: A shapefile of Pennsylvania traffic data has been prepared from PennDOT’s RMSTRAFFIC data, retaining only primary state routes. The shapefile is read into R as a spatial object using st_read function from the sf package:

# Read the PennDOT primary state route segments shapefile
# (projected in EPSG:2272, NAD83 / Pennsylvania South, US feet)
segments <- st_read("shp/traffic/traffic.shp")
## Reading layer `traffic' from data source `/Users/markegge/projects/lighting_safety_r_demo/shp/traffic/traffic.shp' using driver `ESRI Shapefile'
## Simple feature collection with 15866 features and 8 fields (with 1 geometry empty)
## geometry type:  MULTILINESTRING
## dimension:      XY
## bbox:           xmin: 1191414 ymin: 141037.8 xmax: 2807622 ymax: 1075345
## CRS:            2272
# seq_len() is safe even for a zero-row layer, unlike 1:nrow()
segments$id <- seq_len(nrow(segments)) # assign a unique identifier for each segment

Crashes: Having downloaded the PA Crash data files for 2012 – 2016 (five years is the typical period of analysis for safety investigations), the CSV files are loaded into R using the fread function from the data.table package. Compared with read.csv, fread is much faster, and has sensible defaults (e.g. stringsAsFactors = FALSE). A list object with each year’s data is populated, then the rbindlist from data.table binds together the rows from each object in the list.

# Read each year's statewide crash CSV (2012-2016) into a list of
# data.tables, then stack the five tables into a single data.table.
yearly_crashes <- lapply(2012:2016, function(year) {
  fread(paste0("data/Statewide_", year, "/CRASH_", year, "_Statewide.csv"))
})
crashes <- rbindlist(yearly_crashes) # join the five files into one

The result is a dataset called crashes with 627180 observations.

We then run the provided summary_plots function to generate a PDF with a descriptive plot of each attribute in the dataset.

## Plots generated to  out/plots.pdf .

Take a moment to inspect the resulting file out/plots.pdf:

  • What trends do you see in terms of the seasonality of crashes?
  • What data cleaning step would you need to take if you were analyzing crashes by Hour of Day?

Finally, filter our crashes to those occurring at night, and create a “crash_lighting” attribute, based on the metadata in docs/PA_Crashes_Data_Dictionary.pdf:

# Keep only nighttime crashes: ILLUMINATION 2 = "Dark - No Street Lights",
# 3 = "Dark - Street Lights" (see docs/PA_Crashes_Data_Dictionary.pdf),
# then label each record's lighting condition accordingly.
crashes <- crashes[ILLUMINATION %in% c(2, 3)]
crashes[, crash_lighting := ifelse(ILLUMINATION == 2, "unlit", "lit")]

Spatially join crashes to road segments

Next, we spatially join crashes to road segments by buffering the road segments by 50 feet and then performing a spatial join.

Note: for a spatial join to execute properly, both the source and destination layers must be in the same spatial projection. In this case, we use the EPSG:2272 NAD83 / Pennsylvania South (ft) projection, which measures distances in feet.

The st_as_sf function from the sf package creates a spatial object from the crashes dataset based on the latitude and longitude values in the “DEC_LAT” and “DEC_LONG” fields.

# Keep only crash records that have coordinates, promote the table to an sf
# point layer (WGS84 lat/long), then reproject to match the segments layer.
crashes <- crashes[!(is.na(DEC_LONG) | is.na(DEC_LAT))] # keep only records with x, y values
crashes <- st_as_sf(crashes, coords = c("DEC_LONG", "DEC_LAT"), crs = 4326) # create sf spatial object
crashes <- st_transform(crashes, 2272) # reproject data to EPSG:2272 (PA South, units: feet)

# buffer road segments by 50 feet (EPSG:2272 units are feet); FLAT end caps
# avoid capturing crashes beyond the ends of each segment
segment_buffer <- st_buffer(segments, dist = 50, endCapStyle = "FLAT")

# spatially join crash points to the buffered road segments; left = FALSE
# makes this an inner join, so the result has one row per (segment, crash)
# pair and drops buffered segments containing no crashes
joined <- st_join(segment_buffer, crashes, left = FALSE) # inner join

Finally, we create a new data.table of the aggregate crash counts by road segment, and merge this new attribute back to our segments data:

# Drop geometry and convert the joined sf result to a data.table of
# (segment, crash) records for fast aggregation
segment_crashes <- as.data.table(st_drop_geometry(joined)) # convert sf to data.table

# average annual crashes per segment over the 5-year analysis period
crash_counts <- segment_crashes[, .(annual_crashes = .N / 5), by = id]

# merge crash counts back to segments; all.x = TRUE keeps segments with no
# joined crashes (their annual_crashes comes back NA)
segments <- merge(segments, crash_counts, by = "id", all.x = TRUE)

# calculate crash rates - crashes per million VMT
# direct vector subassignment: equivalent to the
# segments[is.na(...), ]$annual_crashes <- 0 replacement-chain pattern,
# but avoids copying the whole sf object
segments$annual_crashes[is.na(segments$annual_crashes)] <- 0
segments$mvmt <- with(segments, (DLY_VMT * 365) / 1000000) # million annual vehicle miles travelled
# NOTE(review): segments with DLY_VMT == 0 yield Inf/NaN rates here —
# confirm downstream filtering (e.g. the rate < 2 / rate < 50 cuts) handles them
segments$rate <- with(segments, annual_crashes / mvmt) # annual crashes per 1m annual vmt

Segment Illumination

Our roadway dataset does not include information about overhead lighting. The section below uses crash data to impute a “lighting” attribute for our road segments, and then uses a decision tree classifier to assign lighting to unknown segments based on known segments.

# count total (5-year) nighttime crashes by segment id and illumination condition
counts <- segment_crashes[, .(crash_count = .N), by = .(id, crash_lighting)]

segment_counts <- dcast(counts, id ~ crash_lighting,  # long to wide: one row per id with "lit"/"unlit" count columns
                      value.var = "crash_count", fill = 0)         # filling missing values with zero

# assign segment_lighting based on majority of crashes (lit or unlit);
# ties go to "lit" because of the >= comparison
segment_counts[, segment_lighting := ifelse(lit >= unlit, "lit", "unlit")] 

# join the imputed roadway lighting condition to all road segments
light <- merge(as.data.table(st_drop_geometry(segments)), # create data.table from segments attributes
               segment_counts[, .(id, segment_lighting)], # join result, keeping only segment_lighting field
               by = "id", all.x = TRUE) # left join by id: segments with no nighttime crashes get NA

# Table of "lit", "unlit", and unknown (NA) lighting condition attributes
table(light$segment_lighting, useNA = "ifany")
## 
##   lit unlit  <NA> 
##  6081  7452  2333

For segments with unknown lighting_condition, predict which road segments are illuminated using an rpart decision tree:

# Fit a classification tree predicting lit/unlit from route, county, district,
# segment length, AADT, and VMT; rows with a missing response are excluded
# from the fit by rpart's default na.action
fit <- rpart(segment_lighting ~ ST_RT_NO + CTY_CODE + DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = light)

# create a confusion matrix to evaluate classifier performance
# (in-sample: predictions are for the same rows used to fit the tree,
# so accuracy is optimistic relative to held-out data)
actual <- light[!is.na(segment_lighting), segment_lighting]
# predict() without newdata returns fitted classes for the training rows;
# these line up with `actual` provided rpart dropped only the rows with a
# missing response — assumed here, TODO confirm
predicted <- predict(fit, type = "class")
(cm <- as.matrix(table(Actual = actual, Predicted = predicted))) # create the confusion matrix
##        Predicted
## Actual   lit unlit
##   lit   4624  1457
##   unlit 1539  5913
accuracy <- sum(diag(cm)) / sum(cm) # number of correctly classified instances per class  / number of instances
p <- rowSums(cm) / sum(cm); q <- colSums(cm) / sum(cm) # marginal class proportions (actual, predicted)
kappa <- (accuracy - sum(p * q)) / (1 - sum(p * q)) # Cohen's kappa; corrects accuracy for chance agreement
cat("Model is", round(accuracy * 100), "% accurate with a Kappa score of", round(kappa, 2))
## Model is 78 % accurate with a Kappa score of 0.55

Our decision tree is reasonably accurate (~78%) and the Kappa score (0.55) indicates moderate evidence that our model outperforms a naïve approach of simply assigning all segments to the majority class.

# use decision tree to fill in missing lighting values
predictions <- predict(fit, type = "class", newdata = light[is.na(segment_lighting)])
# data.table update-by-reference: writes the predicted classes into the
# selected rows in place (same row order as the newdata above). as.character()
# makes the factor -> character conversion explicit so the column type is
# preserved, unlike the light[...]$segment_lighting <- ... replacement-chain
# pattern, which also copies the whole table.
light[is.na(segment_lighting), segment_lighting := as.character(predictions)]

# Updated table of "lit", "unlit", and unknown (NA) lighting condition attributes
table(light$segment_lighting, useNA = "ifany")
## 
##   lit unlit 
##  7413  8453
# join the (observed + imputed) lighting attribute back to road segments
segments <- merge(segments, light[, .(id, lighting = segment_lighting)], by = "id", all.x = TRUE)

Visualize the results:

# limit map data to the Pittsburgh region (PennDOT Districts 11 and 12) and
# reproject to WGS84 lat/long (EPSG:4326), which leaflet expects
leaflet_segments <- st_transform(segments[segments$DISTRICT_N %in% c("11", "12"), ], 4326)

# NOTE(review): three colors are supplied but lighting appears to have only
# two levels ("lit"/"unlit") after imputation — confirm the third is needed
pal <- colorFactor(c("yellow", "brown", "orange"), leaflet_segments$lighting)
leaflet(data = leaflet_segments) %>% 
  addProviderTiles(providers$CartoDB.DarkMatter) %>% 
  addPolylines(color = ~pal(lighting)) %>%
  addLegend(pal = pal, values = ~lighting)

Exploratory Data Analysis

Exploratory data analysis allows the analyst to become more familiar with the data and informs the selection of subsequent modelling or data visualization steps.

# convert spatial object to data.table for easy data manipulation
DT <- as.data.table(st_drop_geometry(segments))
# DT <- DT[!is.na(rate)] # one weird segment has no geom or rate

# plot crash counts vs. vmt
# relationship between crashes and vmt seems fairly linear
ggplot(DT, aes(x = mvmt, y = annual_crashes)) +
  geom_point() + ggtitle("Crash Count vs. VMT")

Crash Rates:

# map crash rates, binned into sextiles so the color ramp is not dominated
# by a handful of extreme-rate segments
qpal <- colorQuantile(palette = "YlOrRd", domain = leaflet_segments$rate, n = 6)
leaflet(data = leaflet_segments) %>% 
  addProviderTiles(providers$CartoDB.DarkMatter) %>% 
  addPolylines(color = ~qpal(rate)) %>%
  addLegend(pal = qpal, values = ~rate, opacity = 1)
# some descriptive plots: the raw rate distribution is heavily right-skewed,
# so a log transform and a truncated (< 50) view are also plotted
hist(DT$rate)

hist(log(DT$rate))

hist(DT[rate < 50]$rate, breaks = 50)

Decision trees are useful tools for determining which dataset attributes are predictive of our outcome variables of interest.

# What explains / predicts crash counts? A regression tree (rpart uses its
# anova method for a numeric response):
fit <- rpart(annual_crashes ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT,
             data = DT)
rpart.plot(fit)

# What explains / predicts crash rates? (note: reuses the name `fit`,
# overwriting the crash-count tree above)
fit <- rpart(rate ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = DT)
rpart.plot(fit)

# box plot of crash rates by lighting condition (restricted to rate < 2
# so outliers don't dominate the plot)
boxplot(rate ~ lighting, data = DT[rate < 2])

# are lighting and traffic volume related? (correlates daily VMT — not AADT,
# despite the original comment — against an indicator for "lit")
cor(DT$DLY_VMT, DT$lighting == "lit") # weak negative correlation
## [1] -0.1467884
hist(DT$mvmt) # VMT is heavily right-skewed

hist(DT$annual_crashes) # annual crashes has a similarly skewed distribution

hist(log(DT$mvmt))

Modeling

Having completed our exploratory data analysis, we see that there’s a reliable relationship between crash counts and VMT. To estimate the lighting relationship, we’ll fit a regression model controlling for VMT.

Count data is typically Poisson or Negative Binomially distributed, so we’ll use an appropriate regression model.

# Count models need non-negative integer outcomes: reconstruct total 5-year
# crash counts from the annual average (annual_crashes = N / 5, so * 5 is exact)
DT[, total_crashes := as.integer(annual_crashes * 5)] # total 5 year crashes for count model (whole numbers)

# fit a Poisson regression of 5-year crash counts on lighting and exposure
fit <- glm(total_crashes ~ lighting + mvmt, data = DT, family = poisson)
summary(fit)
## 
## Call:
## glm(formula = total_crashes ~ lighting + mvmt, family = poisson, 
##     data = DT)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -36.333   -2.399   -1.120    0.743   29.790  
## 
## Coefficients:
##                 Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)    2.0002022  0.0040671  491.80 <0.0000000000000002 ***
## lightingunlit -0.3263883  0.0056339  -57.93 <0.0000000000000002 ***
## mvmt           0.0395376  0.0001177  335.87 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 181931  on 15865  degrees of freedom
## Residual deviance: 128528  on 15863  degrees of freedom
## AIC: 176752
## 
## Number of Fisher Scoring iterations: 5
# goodness of fit measure: chi-square test of residual deviance against
# residual df; a small p-value indicates lack of fit (overdispersion)
1 - pchisq(summary(fit)$deviance, summary(fit)$df.residual) # model does not fit the data (p < 0.05)
## [1] 0
# fit a negative binomial regression model: adds a dispersion parameter
# (theta) to accommodate the overdispersion the Poisson model could not
fit <- MASS::glm.nb(total_crashes ~ lighting + mvmt, data = DT)
summary(fit)
## 
## Call:
## MASS::glm.nb(formula = total_crashes ~ lighting + mvmt, data = DT, 
##     init.theta = 1.132717284, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6615  -0.9306  -0.2972   0.3524   5.2944  
## 
## Coefficients:
##                Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)    1.650005   0.012225  134.97 <0.0000000000000002 ***
## lightingunlit -0.466441   0.016618  -28.07 <0.0000000000000002 ***
## mvmt           0.110142   0.001062  103.67 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.1327) family taken to be 1)
## 
##     Null deviance: 25897  on 15865  degrees of freedom
## Residual deviance: 17943  on 15863  degrees of freedom
## AIC: 92333
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.1327 
##           Std. Err.:  0.0157 
## 
##  2 x log-likelihood:  -92324.9520
# goodness of fit measure: same deviance-based chi-square test as above;
# still rejects, though the deviance is far lower than the Poisson model's
1 - pchisq(summary(fit)$deviance, summary(fit)$df.residual) # model does not fit the data
## [1] 0
# fit a zero-inflated count model: lighting + mvmt for the count component,
# mvmt predicting the excess zeros.
# NOTE(review): the original comment said "negative binomial", but
# pscl::zeroinfl defaults to dist = "poisson" — the output below confirms a
# zero-inflated Poisson; pass dist = "negbin" for a true ZINB.
fit <- pscl::zeroinfl(total_crashes ~ lighting + mvmt | mvmt, data = DT)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit)
## 
## Call:
## pscl::zeroinfl(formula = total_crashes ~ lighting + mvmt | mvmt, data = DT)
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -26.5203  -1.1359  -0.6836   0.4483  49.9393 
## 
## Count model coefficients (poisson with log link):
##                 Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)    2.1916951  0.0040869  536.28 <0.0000000000000002 ***
## lightingunlit -0.3935126  0.0056469  -69.69 <0.0000000000000002 ***
## mvmt           0.0370332  0.0001227  301.72 <0.0000000000000002 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  0.14101    0.03981   3.542             0.000397 ***
## mvmt        -1.98016    0.06056 -32.698 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 9 
## Log-likelihood: -7.621e+04 on 5 Df
# What do the fitted relationships look like? Build a synthetic grid of VMT
# values for each lighting condition and plot the model's predicted counts.
# The 100-value mvmt sequence recycles across the 200 lighting rows, so each
# lighting group spans the full VMT range.
synth <- data.table(lighting = c(rep("lit", 100), rep("unlit", 100)),
                    mvmt = seq(0.1, 70, length.out = 100))
# predict.zeroinfl uses only the model terms (lighting, mvmt); the original
# fit_aadt / synth$CUR_AADT lines fed a column no later step used, so they
# are removed as dead code. type defaults to "response": expected 5-year counts.
synth$total_crashes <- predict(fit, newdata = synth)
ggplot(synth, aes(x = mvmt, y = total_crashes, color = lighting)) + 
  geom_line() +
  ggtitle("Estimated Crashes and VMT", "By Lighting Condition")

Conclusion

The zero-inflated count model's coefficient of −0.39 for unlit segments is on the log scale: controlling for VMT, unlit road segments experience exp(−0.39) ≈ 0.68 times as many crashes as lit segments — roughly 32% fewer — rather than 0.39 fewer crashes in absolute terms. (Note also that the fitted model is a zero-inflated Poisson, not negative binomial: pscl::zeroinfl defaults to dist = "poisson", as the "Count model coefficients (poisson with log link)" heading in the output confirms.)

This finding is rather counter-intuitive from a “lighting improves safety” perspective, but makes sense if lighting is installed in locations with higher crash risk (e.g. intersections).